Задачей данного ноутбука является построение наилучшей модели посредством экспериментов с разными моделями бустинга, оверсэмплингом, дополнительными признаками, а также дальнейшее построение прототипа экономической модели для оценки применимости машинного обучения для бизнеса.
Подключение диска и загрузка библиотек
# Colab environment setup: mount Google Drive and move into the project folder.
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
cd /content/gdrive/'My Drive'/Colab_Notebooks/CHURN_prediction_competition
# Install packages that are not pre-installed in Colab.
pip install category_encoders
pip install catboost
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from matplotlib import pyplot as plt
sns.set_style("darkgrid")
import pandas as pd
import numpy as np
from scipy import stats
import itertools
import random
from category_encoders import LeaveOneOutEncoder,TargetEncoder, BinaryEncoder,SumEncoder,BackwardDifferenceEncoder
from sklearn import model_selection, metrics, pipeline, preprocessing,impute
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score,GridSearchCV
from sklearn.feature_selection import SelectFromModel
from imblearn import over_sampling
import gc
from tqdm import tqdm_notebook
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier,Pool
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,classification_report,roc_curve, confusion_matrix,average_precision_score,precision_recall_curve,auc,precision_score,recall_score
Загрузка данных, инициализация функций, эксперименты с моделью
# Load the Orange small churn training features and labels (labels file has no header row).
features = pd.read_csv('orange_small_churn_data.train')
labels = pd.read_csv('orange_small_churn_labels.train',header= None,names=['target'])
# Stratified 70/30 hold-out split for model validation.
feats_train,feats_val,labels_train,labels_val = train_test_split(features,labels, test_size = 0.3,\
shuffle=True,random_state=42,\
stratify = labels)
feats_train.head()
# Competition test set used for the submissions below.
test_data = pd.read_csv('orange_small_churn_test_data.csv', index_col='ID')
test_data.head()
def null_map(data):
    """Plot a heatmap of the missing-value pattern of *data*.

    Each cell is colored by whether the value is NaN, giving a quick visual
    overview of which columns/rows are sparsely filled.
    """
    data_to_heat = data.isnull()
    # (removed a no-op `data_to_heat.head()` expression whose result was discarded)
    with plt.xkcd():
        plt.figure(figsize=(20, 14))
        colors = ['#000099', '#ffff00']
        sns.heatmap(data_to_heat, cmap=sns.color_palette(colors));
null_map(feats_train)
def feat_classif_clean_nan(data, prop_nan=0.3, len_uniq_num_min=20, len_uniq_min=2,
                           extra_num_cols=('Var73',)):
    """First-pass feature selection (by NaN share) and classification into
    numeric vs categorical columns.

    Parameters
    ----------
    data : pd.DataFrame
        Raw feature frame.
    prop_nan : float
        Maximum allowed share of NaN; columns with more missing values are dropped.
    len_uniq_num_min : int
        Float columns with at most this many unique values are treated as categorical.
    len_uniq_min : int
        Minimum number of unique values for a column to be kept at all.
    extra_num_cols : sequence of str
        Column names force-appended to the numeric list. Defaults to ('Var73',)
        to preserve the original dataset-specific behavior; pass () to disable.

    Returns
    -------
    (num_columns, cat_columns) : tuple of lists of column names.
    """
    min_filled_share = 1 - prop_nan
    # --- float columns ---
    data_float = data.select_dtypes(include=['float64'],).copy()
    cols_float_cat = []
    cols_float_num = []
    for col in data_float.columns:
        # Drop columns whose NaN share exceeds prop_nan.
        if data[col].dropna().shape[0] / data[col].shape[0] > min_filled_share:
            len_unique = np.unique(data[col].dropna()).shape[0]
            if len_unique <= len_uniq_num_min:  # threshold for treating a float column as categorical
                if len_unique >= len_uniq_min:
                    cols_float_cat.append(col)
                # columns with < len_uniq_min unique values (near-constant) are dropped
            else:
                cols_float_num.append(col)
    # --- non-numeric (object) columns ---
    data_not_num = data.select_dtypes(include=['object']).copy()
    cols_cat = []
    for col in data_not_num.columns.tolist():
        # Drop columns whose NaN share exceeds prop_nan.
        if data[col].dropna().shape[0] / data[col].shape[0] > min_filled_share:
            len_unique = np.unique(data[col].dropna()).shape[0]
            if len_unique >= len_uniq_min:
                cols_cat.append(col)
    num_columns = cols_float_num + list(extra_num_cols)  # numeric features
    cat_columns = cols_cat + cols_float_cat              # categorical features
    return num_columns, cat_columns
def fe_cat_clean_too_high_cardinality(data, cat_columns, max_prop_unique=1):
    """Drop uninformative categorical columns.

    A column is considered uninformative when its number of distinct categories,
    relative to the number of filled entries, reaches max_prop_unique
    (e.g. a near-unique identifier column).

    Returns the list of surviving column names.
    """
    cat_subset = data[cat_columns].copy()
    kept = []
    for name in cat_columns:
        counts = cat_subset[name].value_counts()
        # share of distinct categories among non-null entries
        unique_share = counts.shape[0] / counts.sum()
        if unique_share < max_prop_unique:
            kept.append(name)
    return kept
def fe_cat_cardinality_classification(data, cat_columns, N_lim=10):
    """Split categorical columns into low- and high-cardinality groups.

    Columns with fewer than N_lim distinct values (NaN counts as a value,
    matching pandas' unique()) go to the low-cardinality list; the rest go
    to the high-cardinality list.

    Returns (low_cardinality_columns, high_cardinality_columns).
    """
    cat_subset = data[cat_columns].copy()
    low_cardinality_columns = []
    high_cardinality_columns = []
    for name in cat_columns:
        n_levels = cat_subset[name].unique().shape[0]
        target = low_cardinality_columns if n_levels < N_lim else high_cardinality_columns
        target.append(name)
    return low_cardinality_columns, high_cardinality_columns
def data_preprocessor(data, params):
    """Run the feature-selection helpers and return boolean column masks.

    params is [prop_nan, max_prop_unique, N_lim]. The result is a list of three
    boolean arrays over data.columns: [numeric, low-cardinality categorical,
    high-cardinality categorical].
    """
    data = data.copy()
    prop_nan, max_prop_unique, N_lim = params

    numeric_cols, raw_cat_cols = feat_classif_clean_nan(data, prop_nan=prop_nan)
    cat_cols = fe_cat_clean_too_high_cardinality(data, raw_cat_cols,
                                                 max_prop_unique=max_prop_unique)
    low_card_cols, high_card_cols = fe_cat_cardinality_classification(
        data, cat_cols, N_lim=N_lim)

    def column_mask(selected):
        # boolean membership mask over the frame's column order
        return np.array([name in selected for name in data.columns], dtype=bool)

    return [column_mask(numeric_cols),
            column_mask(low_card_cols),
            column_mask(high_card_cols)]
class MyCatMergerNaInputer(BaseEstimator, TransformerMixin):
    """Categorical imputer/merger.

    Fills NaN with a constant, casts everything to str, and merges rare
    categories (whose count is below occurrence_med_prop of the column's
    median category count) into a single per-column '<col>_other_cat' label.
    """

    def __init__(self, occurrence_med_prop=0.2, fill_value='NaN'):
        self.fill_value = fill_value
        # column -> frozenset of rare category labels, built in fit()
        self.var_cat_dict = {}
        self.occurrence_med_prop = occurrence_med_prop

    def _as_str_frame(self, X):
        # Shared preprocessing: wrap, impute the constant, cast to str.
        df = pd.DataFrame(X)
        df = df.fillna(self.fill_value)
        return pd.DataFrame(df, dtype=str)

    def fit(self, X, y=None):
        df = self._as_str_frame(X)
        for col in df.columns:
            level_counts = df[col].value_counts()
            lvl_c_median_norm = level_counts / level_counts.median()
            small_cats = lvl_c_median_norm[lvl_c_median_norm < self.occurrence_med_prop].index
            # Store as a frozenset: `x in <pandas Index>` was an O(n) scan per
            # element inside transform(); set membership is O(1).
            self.var_cat_dict[col] = frozenset(small_cats)
        return self

    def transform(self, X, y=None):
        df = self._as_str_frame(X)
        for col in df.columns:
            rare = self.var_cat_dict[col]
            df[col] = df[col].apply(
                lambda x, rare=rare, col=col: f'{col}_other_cat' if x in rare else x)
        return df.values
def transformation(low_level_transformers, indices_lists):
    """Assemble the full preprocessing FeatureUnion.

    low_level_transformers is [numeric_imputer, numeric_scaler, cat_imputer1,
    cat_imputer2, low_cardinality_cat_transformer,
    high_cardinality_cat_transformer]; indices_lists is the output of
    data_preprocessor. Each branch first selects its columns by boolean mask,
    then imputes, then applies its encoder/scaler.
    """
    (numeric_imputer, numeric_scaler, cat_imputer1, cat_imputer2,
     low_cardinality_cat_transformer, high_cardinality_cat_transformer) = low_level_transformers
    numeric_data_indices, low_card_cat_data_indices, high_card_cat_data_indices = indices_lists

    numeric_branch = pipeline.Pipeline(steps=[
        ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, numeric_data_indices])),
        ('fill_nan', numeric_imputer),
        ('scaling', numeric_scaler),
    ])
    # categories per column < N_lim
    low_card_branch = pipeline.Pipeline(steps=[
        ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, low_card_cat_data_indices])),
        ('fill_nan', cat_imputer1),
        ('low_cardinality_cat_transformer', low_cardinality_cat_transformer),
    ])
    # categories per column >= N_lim
    high_card_branch = pipeline.Pipeline(steps=[
        ('selecting', preprocessing.FunctionTransformer(lambda data: data[:, high_card_cat_data_indices])),
        ('fill_nan', cat_imputer2),
        ('high_cardinality_cat_transformer', high_cardinality_cat_transformer),
    ])
    categorical_union = pipeline.FeatureUnion(transformer_list=[
        ('low_cardinality_transformation', low_card_branch),
        ('high_cardinality_transformation', high_card_branch),
    ])
    return pipeline.FeatureUnion(transformer_list=[
        ('numeric_features_processing', numeric_branch),
        ('cat_features_processing', categorical_union),
    ])
class Estimator(BaseEstimator, TransformerMixin):
    """Glue a (non-pipeline-compatible) transformer and a classifier together.

    fit() fits the transformer first, then the classifier on the transformed
    data; predict/predict_proba transform then delegate to the classifier.
    """

    def __init__(self, transformer, classifier):
        self.transformer = transformer
        self.classifier = classifier

    def fit(self, X, y):
        self.transformer.fit(X, y)
        X_tr = self.transformer.transform(X)
        self.classifier.fit(X_tr, y)
        # Return self per the sklearn estimator contract (previously returned
        # the bare classifier, which breaks chaining such as est.fit(X, y).predict(X)).
        return self

    def predict(self, X):
        X_tr = self.transformer.transform(X)
        return self.classifier.predict(X_tr)

    def predict_proba(self, X):
        X_tr = self.transformer.transform(X)
        return self.classifier.predict_proba(X_tr)
def pr_plot(y_true, probs, thrshs):
    """Plot precision and recall as functions of the decision threshold.

    y_true: true labels, assumed to be in {-1, 1} (matching the churn labels).
    probs: predict_proba output; column 1 is the positive-class probability.
    thrshs: iterable of thresholds to evaluate.
    """
    precisions = []
    recalls = []
    # (removed a dead self-assignment `thrshs = thrshs`)
    for threshold in thrshs:
        y_pred = [1 if prob > threshold else -1 for prob in probs[:, 1]]
        precisions.append(precision_score(y_true, y_pred))
        recalls.append(recall_score(y_true, y_pred))
    plt.plot(thrshs, precisions, label='precision');
    plt.plot(thrshs, recalls, label='recall');
    plt.xlabel('threshold')
    plt.legend()
    plt.title('Зависимость значений метрик от порога')
    plt.grid(True)
    plt.ylabel('metric');
def write_to_submission_file(predicted_labels, out_file,
                             target='result', index_label="Id"):
    """Save a 1-D prediction array as a CSV submission file.

    Rows are indexed 0..n-1 under *index_label*; the prediction column is
    named *target*.
    """
    row_index = np.arange(0, predicted_labels.shape[0])
    submission = pd.DataFrame(predicted_labels, index=row_index, columns=[target])
    submission.to_csv(out_file, index_label=index_label)
def cat_prep(data, cat_columns):
    """Return a copy of *data* with the listed columns cast to str.

    CatBoost requires categorical feature columns to be strings (or ints);
    the original frame is left untouched.
    """
    prepared = data.copy()
    for name in cat_columns:
        prepared[name] = prepared[name].astype(str)
    return prepared
# Boolean masks of the features kept after preprocessing
indices_lists = data_preprocessor(feats_train,params=[0.7, 0.5, 30])# params = [prop_nan,max_prop_unique,N_lim]
# Missing-value imputation
# Numeric features
numeric_imputer = impute.SimpleImputer()# defaults to mean imputation
numeric_scaler = None
# Categorical features
occurrence_med_prop = 0.1# categories smaller than this fraction of the column's median category size get merged into one "other" category
cat_imputer1 = MyCatMergerNaInputer(occurrence_med_prop = occurrence_med_prop,fill_value='Hi')
cat_imputer2 = MyCatMergerNaInputer(occurrence_med_prop = occurrence_med_prop,fill_value='Hi')
# Categorical encoding methods
(low_cardinality_cat_transformer1,high_cardinality_cat_transformer1) = (preprocessing.OneHotEncoder(handle_unknown = 'ignore'),
preprocessing.OneHotEncoder(handle_unknown = 'ignore'))
# List of low-level transformers
low_level_transformers = [numeric_imputer,numeric_scaler,cat_imputer1,cat_imputer2,\
low_cardinality_cat_transformer1,high_cardinality_cat_transformer1]
# Final transformer
transformer = transformation(low_level_transformers,indices_lists)
# Classifier and the resulting estimator (XGBoost baseline)
clf = xgb.XGBClassifier(random_state=42,n_estimators=100,gamma=0.1,max_depth=3,reg_alpha=1,min_child_weight=1)
estimator = Estimator(transformer,clf)
warnings.filterwarnings('ignore')
estimator.fit(feats_train.values,labels_train.values)
prb = estimator.predict_proba(feats_val.values)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator.predict(feats_val.values)
print(classification_report(labels_val.values,pred))
print('------------------')
print('------------------')
# Precision/recall vs threshold on the validation set
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Boolean masks of the features kept after preprocessing
[num_inds, cat_indices1,cat_indices2]= data_preprocessor(feats_train,params=[0.7, 0.5, 30])# params = [prop_nan,max_prop_unique,N_lim]
# Map the masks back to column names for CatBoost (which consumes raw columns)
cat_columns = list(np.array(features.columns)[cat_indices1]) + list(np.array(features.columns)[cat_indices2])
print(cat_columns)
num_columns = list(np.array(features.columns)[num_inds])
print(num_columns)
good_columns = num_columns + cat_columns
# CatBoost baseline: evaluate on the hold-out split to pick the iteration count
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=500,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
# Refit on the full training data with the iteration count chosen above, then score the test set
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(features[good_columns],cat_columns), labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns],cat_columns))
write_to_submission_file(probs[:,1],out_file='submission_cb1.csv')
# Experiment: Ordered boosting with Bernoulli bootstrap -> submission_cb2
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
estimator_cb.fit(cat_prep(features[good_columns],cat_columns), labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns],cat_columns))
write_to_submission_file(probs[:,1],out_file='submission_cb2.csv')
# Experiment: Bayesian bootstrap instead of Bernoulli -> submission_cb4
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bayesian',
bagging_temperature=1,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
estimator_cb.fit(cat_prep(features[good_columns],cat_columns), labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns],cat_columns))
write_to_submission_file(probs[:,1],out_file='submission_cb4.csv')
# Experiment: lower learning rate (0.04), more iterations
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=504,random_state=0,
eval_metric = 'AUC',
learning_rate=0.04,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
# Experiment: depth=7
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=446,random_state=0,
eval_metric = 'AUC',
depth=7,
learning_rate=0.04,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
# Experiment: depth=10
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=446,random_state=0,
eval_metric = 'AUC',
depth=10,
learning_rate=0.04,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
# Refit the depth=10 configuration on the full data -> submission_cb5
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=400,random_state=0,
eval_metric = 'AUC',
learning_rate=0.04,
depth=10,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(features[good_columns],cat_columns), labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns],cat_columns))
write_to_submission_file(probs[:,1],out_file='submission_cb5.csv')
# Experiment: class weighting for the imbalanced target — 'Balanced'
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=446,random_state=0,
eval_metric = 'AUC',
depth=7,
auto_class_weights = 'Balanced',
learning_rate=0.04,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
# Experiment: 'SqrtBalanced' class weighting
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=446,random_state=0,
eval_metric = 'AUC',
depth=7,
auto_class_weights = 'SqrtBalanced',
learning_rate=0.04,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
# Refit SqrtBalanced (lr 0.06) on the full data -> submission_cb6
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=446,random_state=0,
eval_metric = 'AUC',
depth=7,
auto_class_weights = 'SqrtBalanced',
learning_rate=0.06,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(features[good_columns],cat_columns), labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns],cat_columns))
write_to_submission_file(probs[:,1],out_file='submission_cb6.csv')
# Inspect feature importances of the last fitted model and drop the weakest categorical features
train_pool = Pool(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns)
np.array(estimator_cb.get_feature_importance(prettified=True))
np.array(estimator_cb.get_feature_importance(train_pool,'LossFunctionChange', prettified=True))
cat_columns2 = cat_columns.copy()
# Features judged unimportant by the importance tables above
for var in ['Var212','Var202','Var206','Var217','Var199']:
    cat_columns2.remove(var)
len(cat_columns2),len(cat_columns)
good_columns2 = num_columns + cat_columns2
# Retrain on the reduced feature set -> submission_cb7
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns2],cat_columns2),labels_train, cat_features = cat_columns2,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns2],cat_columns2), labels_val))
estimator_cb.fit(cat_prep(features[good_columns2],cat_columns2), labels, cat_features = cat_columns2,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns2],cat_columns2))
write_to_submission_file(probs[:,1],out_file='submission_cb7.csv')
# Second importance pass: refit on the full feature set and drop only the two weakest features
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
train_pool = Pool(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns)
np.array(estimator_cb.get_feature_importance(train_pool,'LossFunctionChange', prettified=True))
cat_columns2 = cat_columns.copy()
for var in ['Var199','Var217']:
    cat_columns2.remove(var)
good_columns2 = num_columns + cat_columns2
# Shorter run (200 iterations) on the reduced feature set, full data -> submission_cb8
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=200,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(cat_prep(features[good_columns2],cat_columns2), labels, cat_features = cat_columns2,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns2],cat_columns2))
write_to_submission_file(probs[:,1],out_file='submission_cb8.csv')
# Blend all submission files by simple averaging of predicted probabilities.
# NOTE(review): 'submission_cb3' and 'submission3'..'submission_final' are not
# produced in this notebook — presumably created by earlier runs/other notebooks;
# the files must already exist in the working directory.
sub_names = ['submission_cb1','submission_cb2','submission_cb3','submission_cb4','submission_cb5','submission_cb6','submission_cb7',
'submission_cb8','submission3','submission4','submission5','submission6','submission7','submission8','submission9','submission_final']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df.head()
sub_df['mean'] = sub_df.mean(axis=1)
sub_df.head()
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean1.csv')
# Boolean masks of the features kept after preprocessing (same pipeline, now with LightGBM)
indices_lists = data_preprocessor(feats_train,params=[0.7, 0.5, 30])# params = [prop_nan,max_prop_unique,N_lim]
# Missing-value imputation
# Numeric features
numeric_imputer = impute.SimpleImputer()# defaults to mean imputation
numeric_scaler = None
# Categorical features
occurrence_med_prop = 0.1# categories smaller than this fraction of the column's median category size get merged into one "other" category
cat_imputer1 = MyCatMergerNaInputer(occurrence_med_prop = occurrence_med_prop,fill_value='Hi')
cat_imputer2 = MyCatMergerNaInputer(occurrence_med_prop = occurrence_med_prop,fill_value='Hi')
# Categorical encoding methods
(low_cardinality_cat_transformer1,high_cardinality_cat_transformer1) = (preprocessing.OneHotEncoder(handle_unknown = 'ignore'),
preprocessing.OneHotEncoder(handle_unknown = 'ignore'))
# List of low-level transformers
low_level_transformers = [numeric_imputer,numeric_scaler,cat_imputer1,cat_imputer2,\
low_cardinality_cat_transformer1,high_cardinality_cat_transformer1]
# Final transformer
transformer = transformation(low_level_transformers,indices_lists)
clf = lgb.LGBMClassifier(random_state=42,n_estimators=100,max_depth=3)
estimator = Estimator(transformer,clf)
warnings.filterwarnings('ignore')
estimator.fit(feats_train.values,labels_train.values)
prb = estimator.predict_proba(feats_val.values)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator.predict(feats_val.values)
print(classification_report(labels_val.values,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Refit on the full data and write submission_lgb1
clf = lgb.LGBMClassifier(random_state=42,n_estimators=100,max_depth=3)
estimator = Estimator(transformer,clf)
estimator.fit(features.values, labels.values)
probs = estimator.predict_proba(test_data.values)
write_to_submission_file(probs[:,1],out_file='submission_lgb1.csv')
# LightGBM experiment 2: unlimited depth, more trees, light L1/L2 regularization
clf = lgb.LGBMClassifier(random_state=42,n_estimators=200,max_depth=-1, learning_rate=0.05,reg_alpha=0.05,reg_lambda=0.05)
estimator = Estimator(transformer,clf)
warnings.filterwarnings('ignore')
estimator.fit(feats_train.values,labels_train.values)
prb = estimator.predict_proba(feats_val.values)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator.predict(feats_val.values)
print(classification_report(labels_val.values,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Refit on the full data and write submission_lgb2
clf = lgb.LGBMClassifier(random_state=42,n_estimators=200,max_depth=-1, learning_rate=0.05,reg_alpha=0.05,reg_lambda=0.05)
estimator = Estimator(transformer,clf)
estimator.fit(features.values, labels.values)
probs = estimator.predict_proba(test_data.values)
write_to_submission_file(probs[:,1],out_file='submission_lgb2.csv')
# Re-evaluate the reference CatBoost configuration with full validation metrics
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4,
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
prb = estimator_cb.predict_proba(cat_prep(feats_val[good_columns],cat_columns))
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(cat_prep(feats_val[good_columns],cat_columns))
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Experiment: fewer iterations (120) with weak L2 leaf regularization
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=120,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4,
l2_leaf_reg=0.1
)
estimator_cb.fit(cat_prep(feats_train[good_columns],cat_columns), labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (cat_prep(feats_val[good_columns],cat_columns), labels_val))
prb = estimator_cb.predict_proba(cat_prep(feats_val[good_columns],cat_columns))
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(cat_prep(feats_val[good_columns],cat_columns))
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Refit the regularized configuration on the full data -> submission_cb9
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=120,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4,
l2_leaf_reg=0.1
)
estimator_cb.fit(cat_prep(features[good_columns],cat_columns), labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(cat_prep(test_data[good_columns],cat_columns))
write_to_submission_file(probs[:,1],out_file='submission_cb9.csv')
def normed_fe_interaction(dataset, level=3, max_feats=10, num_columns=[]):
    """Create interaction features from combinations of numeric columns.

    For each *level*-sized combination of columns in num_columns (at most
    max_feats combinations), each column is z-normalized and the new feature is
    the sum of all pairwise products within the combination (for level=2 this
    is simply a*b; for level=3 it is a*b + a*c + b*c, etc.). The new columns
    are named '<col1>,<col2>,..._mix' and appended to a copy of *dataset*.

    Previously only levels 2-5 were supported via copy-pasted branches (any
    other level raised NameError); this version works for any level >= 2 and
    produces identical results for levels 2-5.

    Note: num_columns defaults to an immutable usage of [] — it is never
    mutated, so the shared-default pitfall does not apply.
    """
    dataset = dataset.copy()
    data = dataset[num_columns]
    # Pre-compute the z-normalized columns once instead of per combination.
    normed = {col: (data[col] - data[col].mean()) / data[col].std()
              for col in data.columns}
    fe_list = []
    for comb in list(itertools.combinations(data.columns, level))[:max_feats]:
        # Sum of products over all unordered pairs within the combination.
        feature = sum(normed[a] * normed[b]
                      for a, b in itertools.combinations(comb, 2))
        feature.name = f'{",".join(comb)}_mix'
        fe_list.append(pd.DataFrame(feature))
    if fe_list == []:
        new_features = pd.DataFrame()
    else:
        new_features = pd.concat(fe_list, axis=1)
    res_dataframe = pd.concat([dataset, new_features], axis=1)
    return res_dataframe
# Evaluate CatBoost with level-3 interaction features added to the numeric columns
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
data_train = cat_prep(normed_fe_interaction(feats_train[good_columns],level=3, max_feats = 100,num_columns=num_columns),cat_columns)
data_val = cat_prep(normed_fe_interaction(feats_val[good_columns],level=3, max_feats = 100,num_columns=num_columns),cat_columns)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Full-data fits with level-2 and level-3 interaction features -> two submissions
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
data = cat_prep(normed_fe_interaction(features[good_columns],level=2, max_feats = 100,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=2, max_feats = 100,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe1.csv')
# level-3 interactions
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
data = cat_prep(normed_fe_interaction(features[good_columns],level=3, max_feats = 100,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=3, max_feats = 100,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe2.csv')
class OverSamplerEstimator2(BaseEstimator, TransformerMixin):
    """Like Estimator, but oversamples the transformed training data before
    fitting the classifier (to counter class imbalance).

    The oversampler must implement fit_resample (imblearn interface) and is
    applied only in fit(); prediction paths are untouched.
    """

    def __init__(self, transformer, oversampler, classifier):
        self.transformer = transformer
        self.oversampler = oversampler
        self.classifier = classifier

    def fit(self, X, y):
        self.transformer.fit(X, y)
        X_tr = self.transformer.transform(X)
        X_resampled, y_resampled = self.oversampler.fit_resample(X_tr, y)
        self.classifier.fit(X_resampled, y_resampled)
        # Return self per the sklearn estimator contract (previously returned
        # the bare classifier), consistent with Estimator.fit.
        return self

    def predict(self, X):
        X_tr = self.transformer.transform(X)
        return self.classifier.predict(X_tr)

    def predict_proba(self, X):
        X_tr = self.transformer.transform(X)
        return self.classifier.predict_proba(X_tr)
# Oversampling experiments with XGBoost: SMOTE with default ratio first.
oversampler_1 = over_sampling.SMOTE()
clf_res = xgb.XGBClassifier(random_state=42,n_estimators=100,gamma=0.1,max_depth=3,reg_alpha=1,min_child_weight=1)
estimator = OverSamplerEstimator2(transformer,oversampler_1,clf_res)
estimator.fit(features.values, labels.values)
probs = estimator.predict_proba(test_data.values)
write_to_submission_file(probs[:,1],out_file='submission_overs1.csv')

# Plain random oversampling up to a 0.5 minority/majority ratio.
oversampler_2 = over_sampling.RandomOverSampler(sampling_strategy=0.5,random_state=0)
clf_res = xgb.XGBClassifier(random_state=42,n_estimators=100,gamma=0.1,max_depth=3,reg_alpha=1,min_child_weight=1)
estimator = OverSamplerEstimator2(transformer,oversampler_2,clf_res)
estimator.fit(features.values, labels.values)
probs = estimator.predict_proba(test_data.values)
write_to_submission_file(probs[:,1],out_file='submission_overs2.csv')
%%time
# Build level-2 interaction features for train/test, then assemble the column
# transformer: mean imputation for numerics, rare-category merging plus
# one-hot encoding for categoricals.
data_train = normed_fe_interaction(features,level=2, max_feats = 100,num_columns=num_columns)
data_test = normed_fe_interaction(test_data,level=2, max_feats = 100,num_columns=num_columns)
indices_lists = data_preprocessor(data_train ,params=[0.7, 0.5, 30])# params = [prop_nan,max_prop_unique,N_lim]
numeric_imputer = impute.SimpleImputer()# fills with column means by default
numeric_scaler = None
occurrence_med_prop = 0.1# categories smaller than this share of the median category size are merged into one
cat_imputer1 = MyCatMergerNaInputer(occurrence_med_prop = occurrence_med_prop,fill_value='Hi')
cat_imputer2 = MyCatMergerNaInputer(occurrence_med_prop = occurrence_med_prop,fill_value='Hi')
(low_cardinality_cat_transformer1,high_cardinality_cat_transformer1) = (preprocessing.OneHotEncoder(handle_unknown = 'ignore'),
    preprocessing.OneHotEncoder(handle_unknown = 'ignore'))
low_level_transformers = [numeric_imputer,numeric_scaler,cat_imputer1,cat_imputer2,\
    low_cardinality_cat_transformer1,high_cardinality_cat_transformer1]
transformer = transformation(low_level_transformers,indices_lists)

# SMOTE vs. random oversampling (ratio 0.2) with LightGBM on the engineered features.
oversampler_3 = over_sampling.SMOTE(sampling_strategy=0.2,random_state=0)
clf_res = lgb.LGBMClassifier(random_state=42,n_estimators=100,max_depth=3)
estimator = OverSamplerEstimator2(transformer,oversampler_3,clf_res)
estimator.fit(data_train.values, labels.values)
probs = estimator.predict_proba(data_test.values)
write_to_submission_file(probs[:,1],out_file='submission_overs3_lgb.csv')
oversampler_4 = over_sampling.RandomOverSampler(sampling_strategy=0.2,random_state=0)
clf_res = lgb.LGBMClassifier(random_state=42,n_estimators=100,max_depth=3)
estimator = OverSamplerEstimator2(transformer,oversampler_4,clf_res)
estimator.fit(data_train.values, labels.values)
probs = estimator.predict_proba(data_test.values)
write_to_submission_file(probs[:,1],out_file='submission_overs4_lgb.csv')
# Blend ensemble #2: simple mean of the predicted probabilities of all
# submissions produced so far.
sub_names = ['submission_cb1','submission_cb2','submission_cb3','submission_cb4','submission_cb5','submission_cb6','submission_cb7',
    'submission_cb8','submission3','submission4','submission5','submission6','submission7','submission8','submission9','submission_final',
    'submission_cb9','submission_cb_new_fe1','submission_cb_new_fe2','submission_lgb1','submission_lgb2','submission_mean1',
    'submission_overs1','submission_overs2','submission_overs3_lgb','submission_overs4_lgb']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df.head()
sub_df['mean'] = sub_df.mean(axis=1)
sub_df.head()
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean2.csv')

# Blend ensemble #3: a narrower selection of the stronger submissions.
sub_names = ['submission_cb1','submission_cb2','submission_cb3','submission_cb4','submission_cb5','submission_cb6','submission_cb7',
    'submission3','submission4','submission9','submission_final',
    'submission_cb9','submission_cb_new_fe1','submission_cb_new_fe2','submission_lgb1','submission_lgb2','submission_mean1',
    'submission_overs2','submission_mean2']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean3.csv')

# Blend ensemble #4: CatBoost variants plus the earlier blends.
sub_names = ['submission_cb2','submission_cb3','submission_cb5','submission_cb6','submission_cb_new_fe1','submission_cb_new_fe2','submission_mean1',
    'submission_mean2','submission_mean3']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean4.csv')

# Blend ensemble #6. NOTE(review): 'submission_mean5' is read here but never
# written in this notebook chunk — presumably produced in an earlier session;
# verify the file exists before re-running.
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean6.csv')

# Blend ensemble #8 (likewise depends on mean5/mean7 produced elsewhere).
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_mean6','submission_mean7']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean8.csv')
# Oversampling with mixed numeric/categorical data: NaNs are first masked with
# the sentinel string 'NANANA' (presumably because the oversampler rejects
# missing values — confirm), the rows are randomly oversampled, then the
# sentinel is converted back to NaN so CatBoost sees real missing values.
data0 = cat_prep(normed_fe_interaction(features[good_columns],level=2, max_feats = 100,num_columns=num_columns),cat_columns)
null_map(data0)
data0_nafilled = data0.fillna('NANANA')
null_map(data0_nafilled)
oversampler_5 = over_sampling.RandomOverSampler(sampling_strategy=0.15,random_state=0)
data10,labels1 = oversampler_5.fit_resample(data0_nafilled,labels)
data11 = pd.DataFrame(data10,columns = data0.columns)
data12 = data11.copy()
data12.head()
for col in data12.columns:
    # NOTE(review): boolean-mask assignment is conventionally done with .loc;
    # .at expects scalar labels — confirm this works on the pinned pandas version.
    data12.at[data12[col]=='NANANA',col] = np.nan
data12.head()
null_map(data12)
data12.shape
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=2, max_feats = 100,num_columns=num_columns),cat_columns)
data1 = data12.copy()
# CatBoost fitted on the resampled training set.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
estimator_cb.fit(data1, labels1, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe1_resampled.csv')

# Hold-out validation: level-2 interactions with max_feats=200.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data_train = cat_prep(normed_fe_interaction(feats_train[good_columns],level=2, max_feats = 200,num_columns=num_columns),cat_columns)
data_val = cat_prep(normed_fe_interaction(feats_val[good_columns],level=2, max_feats = 200,num_columns=num_columns),cat_columns)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Full-train run: level-2 interactions with a larger budget (max_feats=200),
# fewer iterations (300).
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=300,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction(features[good_columns],level=2, max_feats = 200,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=2, max_feats = 200,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe3.csv')

# Hold-out validation: level-3 interactions, max_feats=300.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data_train = cat_prep(normed_fe_interaction(feats_train[good_columns],level=3, max_feats = 300,num_columns=num_columns),cat_columns)
data_val = cat_prep(normed_fe_interaction(feats_val[good_columns],level=3, max_feats = 300,num_columns=num_columns),cat_columns)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])

# Full-train run: level-3 interactions, shorter training (180 iterations).
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=180,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction(features[good_columns],level=3, max_feats = 300,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=3, max_feats = 300,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe4.csv')

# Hold-out validation: level-4 interactions.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data_train = cat_prep(normed_fe_interaction(feats_train[good_columns],level=4, max_feats = 300,num_columns=num_columns),cat_columns)
data_val = cat_prep(normed_fe_interaction(feats_val[good_columns],level=4, max_feats = 300,num_columns=num_columns),cat_columns)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])

# Full-train run: level-4 interactions, 250 iterations.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=250,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction(features[good_columns],level=4, max_feats = 300,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=4, max_feats = 300,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe5.csv')

# Hold-out validation: level-5 interactions.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data_train = cat_prep(normed_fe_interaction(feats_train[good_columns],level=5, max_feats = 300,num_columns=num_columns),cat_columns)
data_val = cat_prep(normed_fe_interaction(feats_val[good_columns],level=5, max_feats = 300,num_columns=num_columns),cat_columns)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])

# Full-train run: level-5 interactions with max_feats=400, 250 iterations.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=250,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction(features[good_columns],level=5, max_feats = 400,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction(test_data[good_columns],level=5, max_feats = 400,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe6.csv')

# Blend ensemble #9 (depends on mean5/mean7 produced in earlier sessions).
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_cb_new_fe6',
    'submission_mean6','submission_mean7','submission_mean8']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean9.csv')
# Two-stage feature engineering: re-derive usable columns, build level-2
# interactions (stage 1), then build level-5 interactions on top (stage 2).
[num_inds, cat_indices1,cat_indices2]= data_preprocessor(feats_train,params=[0.7, 0.5, 30])# params = [prop_nan,max_prop_unique,N_lim]
# Indices come from feats_train but are mapped to features.columns — the two
# frames are assumed to share the same column order.
cat_columns0 = list(np.array(features.columns)[cat_indices1]) + list(np.array(features.columns)[cat_indices2])
num_columns0 = list(np.array(features.columns)[num_inds])
good_columns0 = num_columns0 + cat_columns0
print(num_columns0)
# Stage 1: level-2 interactions.
data_train0 = cat_prep(normed_fe_interaction(feats_train[good_columns0],level=2, max_feats = 60,num_columns=num_columns0),cat_columns0)
data_val0 = cat_prep(normed_fe_interaction(feats_val[good_columns0],level=2, max_feats = 60,num_columns=num_columns0),cat_columns0)
# Re-classify the augmented frame's columns for the second stage.
[num_inds, cat_indices1,cat_indices2]= data_preprocessor(data_train0,params=[0.7, 0.5, 30])# params = [prop_nan,max_prop_unique,N_lim]
cat_columns1 = list(np.array(data_train0.columns)[cat_indices1]) + list(np.array(data_train0.columns)[cat_indices2])
num_columns1 = list(np.array(data_train0.columns)[num_inds])
good_columns1 = num_columns1 + cat_columns1
print(num_columns1)
# Stage 2: level-5 interactions built from the stage-1 features.
data_train1 = cat_prep(normed_fe_interaction(data_train0[good_columns1],level=5, max_feats = 150,num_columns=num_columns1),cat_columns1)
data_val1 = cat_prep(normed_fe_interaction(data_val0[good_columns1],level=5, max_feats = 150,num_columns=num_columns1),cat_columns1)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
estimator_cb.fit(data_train1, labels_train, cat_features = cat_columns1,verbose=10, plot=True,
    eval_set = (data_val1, labels_val))
prb = estimator_cb.predict_proba(data_val1)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val1)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Full-train version of the two-stage features for a submission.
# NOTE(review): stage 1 uses max_feats = 70 here vs 60 above, so the
# validation and submission feature sets differ slightly — confirm intentional.
data0 = cat_prep(normed_fe_interaction(features[good_columns0],level=2, max_feats = 70,num_columns=num_columns0),cat_columns0)
data_test0 = cat_prep(normed_fe_interaction(test_data[good_columns0],level=2, max_feats = 70,num_columns=num_columns0),cat_columns0)
data1 = cat_prep(normed_fe_interaction(data0[good_columns1],level=5, max_feats = 150,num_columns=num_columns1),cat_columns1)
data_test1 = cat_prep(normed_fe_interaction(data_test0[good_columns1],level=5, max_feats = 150,num_columns=num_columns1),cat_columns1)
data1.columns
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
estimator_cb.fit(data1, labels, cat_features = cat_columns1,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test1)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe7.csv')
def normed_fe_interaction_2_3_4_5(dataset, max_feats = 10,num_columns=None):
    """Append interaction features built from standardized numeric columns.

    For each combination of 2, 3, 4 and 5 columns from ``num_columns``
    (at most ``max_feats`` combinations of each size), a feature equal to
    the sum of pairwise products of the z-scored columns is added, named
    ``'col1,col2,..._mix'``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input data; a copy with all original columns is returned.
    max_feats : int
        Cap on the number of combinations taken per combination size.
    num_columns : list of str or None
        Numeric columns to combine; ``None`` or empty adds no features.

    Returns
    -------
    pd.DataFrame
        ``dataset`` with the interaction features appended.
    """
    # Fixed: was a mutable default argument (num_columns=[]).
    if num_columns is None:
        num_columns = []
    dataset = dataset.copy()
    data = dataset[num_columns]

    def _zscored(col):
        # Standardize with pandas defaults (mean/std skip NaN, ddof=1).
        series = data[col]
        return (series - series.mean()) / series.std()

    fe_list = []
    for size in (2, 3, 4, 5):
        for comb in list(itertools.combinations(data.columns, size))[:max_feats]:
            normed = [_zscored(c) for c in comb]
            # Sum of products over all unordered pairs of the z-scored
            # columns — replaces four near-identical hand-unrolled branches.
            feature = sum(a * b for a, b in itertools.combinations(normed, 2))
            feature.name = f"{','.join(comb)}_mix"
            fe_list.append(pd.DataFrame(feature))
    new_features = pd.concat(fe_list, axis=1) if fe_list else pd.DataFrame()
    res_dataframe = pd.concat([dataset, new_features], axis=1)
    return res_dataframe
# Hold-out validation of the combined 2/3/4/5-way interaction features.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data_train = cat_prep(normed_fe_interaction_2_3_4_5(feats_train[good_columns], max_feats = 100,num_columns=num_columns),cat_columns)
data_val = cat_prep(normed_fe_interaction_2_3_4_5(feats_val[good_columns], max_feats = 100,num_columns=num_columns),cat_columns)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])

# Full-train submissions with the same features at different budgets:
# fe8 (max_feats=100, 380 iters), fe9 (50, 350), fe10 (200, 450).
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction_2_3_4_5(features[good_columns], max_feats = 100,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction_2_3_4_5(test_data[good_columns], max_feats = 100,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe8.csv')
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=350,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction_2_3_4_5(features[good_columns], max_feats = 50,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction_2_3_4_5(test_data[good_columns], max_feats = 50,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe9.csv')
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=450,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
data = cat_prep(normed_fe_interaction_2_3_4_5(features[good_columns], max_feats = 200,num_columns=num_columns),cat_columns)
data_test = cat_prep(normed_fe_interaction_2_3_4_5(test_data[good_columns], max_feats = 200,num_columns=num_columns),cat_columns)
estimator_cb.fit(data, labels, cat_features = cat_columns,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe10.csv')

# Blend ensemble #10 (depends on mean5/mean7 produced in earlier sessions).
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_cb_new_fe6',
    'submission_mean6','submission_mean7','submission_mean8','submission_mean9',
    'submission_cb_new_fe8','submission_cb_new_fe9']
submission_list = []
for submission_name in sub_names:
    sub = pd.read_csv(f'{submission_name}.csv',index_col=0)
    submission_list.append(sub)
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean10.csv')
def categ_fe_interaction(dataset, max_feats = 10,cat_columns=None):
    """Append pairwise categorical interaction features.

    For each pair of columns from ``cat_columns`` (at most ``max_feats``
    pairs), a new feature is added whose values are the string
    concatenation of the two columns, named ``'col1,col2_mix'``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input data; a copy with all original columns is returned.
    max_feats : int
        Cap on the number of column pairs combined.
    cat_columns : list of str or None
        Categorical columns to combine; ``None`` adds no features.

    Returns
    -------
    (pd.DataFrame, list of str)
        The augmented frame, and the categorical column list extended with
        the names of the new features.
    """
    # Fixed: was a mutable default argument (cat_columns=[]).
    if cat_columns is None:
        cat_columns = []
    dataset = dataset.copy()
    data = dataset[cat_columns]
    fe_list = []
    for comb in list(itertools.combinations(data.columns, 2))[:max_feats]:
        # Concatenate the two columns' string representations.
        feature = data[comb[0]].astype(str) + data[comb[1]].astype(str)
        feature.name = f'{comb[0]},{comb[1]}_mix'
        fe_list.append(pd.DataFrame(feature))
    new_features = pd.concat(fe_list, axis=1) if fe_list else pd.DataFrame()
    res_dataframe = pd.concat([dataset, new_features], axis=1)
    cat_columns_new = cat_columns + list(new_features.columns)
    return res_dataframe, cat_columns_new
def metric_estimate_prediction(model, data_val, labels_val, cat_columns):
    """Score a fitted model on validation data with ROC AUC.

    ``cat_columns`` is accepted only to keep the signature uniform with the
    other evaluation helpers; it is not used here.
    """
    positive_probs = model.predict_proba(data_val)[:, 1]
    return roc_auc_score(labels_val.values, positive_probs)
def feature_selection_loop_prediction(model, data_val, labels_val, cat_columns, threshold=1e-4):
    """Permutation-importance feature screening on an already-fitted model.

    Each column of ``data_val`` is shuffled in turn and the model re-scored;
    a column is kept when shuffling it degrades ROC AUC by more than
    ``threshold`` times the baseline score. Kept columns are split into
    categorical and numeric lists according to ``cat_columns``.

    Returns
    -------
    (good_num_columns, good_cat_columns, diff_metric_without_fe)
        The kept numeric columns, the kept categorical columns, and a dict
        mapping every column to its (permuted - baseline) metric delta.
    """
    data_val = data_val.copy()
    baseline = metric_estimate_prediction(model, data_val, labels_val, cat_columns)
    good_cat_columns, good_num_columns = [], []
    diff_metric_without_fe = {}
    for col in tqdm_notebook(data_val.columns):
        shuffled = data_val.copy()
        shuffled[col] = np.random.permutation(shuffled[col].values)
        delta = metric_estimate_prediction(model, shuffled, labels_val, cat_columns) - baseline
        diff_metric_without_fe[col] = delta
        print('----------')
        print(f'{col} diff_metric: {delta}')
        print('----------')
        # Keep the column only if shuffling it noticeably hurt the score.
        if delta < -threshold * baseline:
            if col in cat_columns:
                good_cat_columns.append(col)
                print('----------')
                print(f'good_cat_columns: {good_cat_columns}')
            else:
                good_num_columns.append(col)
                print('----------')
                print(f'good_num_columns: {good_num_columns}')
        # Free the per-column copy promptly; data_val can be large.
        gc.collect()
    return good_num_columns, good_cat_columns, diff_metric_without_fe
# Add pairwise categorical interaction features on top of level-2 numeric
# interactions, then validate on the hold-out set.
data_plus_new_num_train = normed_fe_interaction(feats_train[good_columns],level=2, max_feats = 100,num_columns=num_columns)
data_plus_new_num_val = normed_fe_interaction(feats_val[good_columns],level=2, max_feats = 100,num_columns=num_columns)
data_plus_num_cat_train,cat_columns_new = categ_fe_interaction(data_plus_new_num_train, max_feats = 100,cat_columns=cat_columns)
# The generated pair names depend only on column order, so the train-side
# cat_columns_new list is valid for the validation frame as well.
data_plus_num_cat_val = categ_fe_interaction(data_plus_new_num_val, max_feats = 100,cat_columns=cat_columns)[0]
data_train = cat_prep(data_plus_num_cat_train,cat_columns_new)
data_val = cat_prep(data_plus_num_cat_val,cat_columns_new)
data_train.head()
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns_new,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
prb = estimator_cb.predict_proba(data_val)
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val)
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])

# Full-train version of the numeric + categorical interaction recipe.
data_plus_new_num = normed_fe_interaction(features[good_columns],level=2, max_feats = 100,num_columns=num_columns)
data_plus_new_num_test = normed_fe_interaction(test_data[good_columns],level=2, max_feats = 100,num_columns=num_columns)
data_plus_num_cat,cat_columns_new = categ_fe_interaction(data_plus_new_num, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_test = categ_fe_interaction(data_plus_new_num_test, max_feats = 100,cat_columns=cat_columns)[0]
data = cat_prep(data_plus_num_cat,cat_columns_new)
data_test = cat_prep(data_plus_num_cat_test,cat_columns_new)
data.head()
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
estimator_cb.fit(data, labels, cat_features = cat_columns_new,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test)
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe_final1.csv')

# Same recipe with the 2/3/4/5-way numeric interactions, then permutation
# importance feature screening on the validation set.
data_plus_new_num_train = normed_fe_interaction_2_3_4_5(feats_train[good_columns],max_feats = 100,num_columns=num_columns)
data_plus_new_num_val = normed_fe_interaction_2_3_4_5(feats_val[good_columns],max_feats = 100,num_columns=num_columns)
data_plus_num_cat_train,cat_columns_new = categ_fe_interaction(data_plus_new_num_train, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_val = categ_fe_interaction(data_plus_new_num_val, max_feats = 100,cat_columns=cat_columns)[0]
data_train = cat_prep(data_plus_num_cat_train,cat_columns_new)
data_val = cat_prep(data_plus_num_cat_val,cat_columns_new)
data_train.head()
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
    eval_metric = 'AUC',
    learning_rate=0.08,
    boosting_type = 'Ordered',
    bootstrap_type='Bernoulli',
    subsample=0.8,
    one_hot_max_size=10,
    leaf_estimation_iterations=10,
    max_ctr_complexity=4
    )
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns_new,verbose=10, plot=True,
    eval_set = (data_val, labels_val))
import gc# already imported at the top of the notebook; harmless repeat
# Slow: one model re-scoring per column. The kept columns are hard-coded below.
feature_selection_loop_prediction(estimator_cb,data_val,labels_val,cat_columns_new,threshold=1e-4)
# Feature subsets kept by the permutation-importance screening — hard-coded,
# presumably from the feature_selection_loop_prediction run above, so the
# slow loop need not be re-executed on every notebook restart.
num_cols_selected = ['Var13',
    'Var73',
    'Var74',
    'Var81',
    'Var113',
    'Var126',
    'Var189',
    'Var6,Var126_mix',
    'Var13,Var28_mix',
    'Var13,Var38_mix',
    'Var13,Var109_mix',
    'Var13,Var126_mix',
    'Var13,Var189_mix',
    'Var21,Var126_mix',
    'Var21,Var133_mix',
    'Var6,Var21,Var153_mix',
    'Var6,Var24,Var28_mix',
    'Var6,Var24,Var76_mix',
    'Var6,Var13,Var24,Var126_mix',
    'Var6,Var13,Var25,Var28_mix',
    'Var6,Var13,Var25,Var57_mix',
    'Var6,Var13,Var21,Var24,Var126_mix',
    'Var6,Var13,Var21,Var28,Var126_mix']
cat_cols_selected = ['Var132',
    'Var205',
    'Var206',
    'Var210',
    'Var219',
    'Var192',
    'Var199',
    'Var204',
    'Var217',
    'Var7,Var65_mix',
    'Var7,Var218_mix',
    'Var7,Var219_mix',
    'Var7,Var221_mix',
    'Var7,Var192_mix',
    'Var7,Var193_mix',
    'Var7,Var199_mix',
    'Var7,Var202_mix',
    'Var7,Var212_mix',
    'Var7,Var216_mix',
    'Var7,Var217_mix',
    'Var7,Var220_mix',
    'Var35,Var181_mix',
    'Var35,Var205_mix',
    'Var35,Var208_mix',
    'Var35,Var210_mix',
    'Var35,Var218_mix',
    'Var35,Var219_mix',
    'Var35,Var225_mix',
    'Var35,Var229_mix',
    'Var35,Var192_mix',
    'Var35,Var193_mix',
    'Var35,Var199_mix',
    'Var35,Var202_mix',
    'Var35,Var222_mix',
    'Var35,Var228_mix',
    'Var44,Var78_mix',
    'Var44,Var203_mix',
    'Var44,Var206_mix',
    'Var44,Var210_mix',
    'Var44,Var218_mix',
    'Var44,Var219_mix',
    'Var44,Var223_mix']
good_columns2 = cat_cols_selected + num_cols_selected
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data_train[good_columns2], labels_train, cat_features = cat_cols_selected,verbose=10, plot=True,
eval_set = (data_val[good_columns2], labels_val))
prb = estimator_cb.predict_proba(data_val[good_columns2])
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val[good_columns2])
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Rebuild the engineered features on the FULL labelled set and on the test set
# (same transforms as used for the train/val split), retrain on all labels and
# write a submission file.
# normed_fe_interaction_2_3_4_5 presumably generates numeric interaction
# features of orders 2-5 — verify against its definition.
data_plus_new_num = normed_fe_interaction_2_3_4_5(features[good_columns], max_feats = 100,num_columns=num_columns)
data_plus_new_num_test = normed_fe_interaction_2_3_4_5(test_data[good_columns], max_feats = 100,num_columns=num_columns)
# Categorical interaction features; the helper also returns the updated list of
# categorical column names (needed below for cat_prep and cat_features).
data_plus_num_cat,cat_columns_new = categ_fe_interaction(data_plus_new_num, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_test = categ_fe_interaction(data_plus_new_num_test, max_feats = 100,cat_columns=cat_columns)[0]
data = cat_prep(data_plus_num_cat,cat_columns_new)
data_test = cat_prep(data_plus_num_cat_test,cat_columns_new)
data.head()
# Fewer iterations (210) than the validation run: final fit has no eval_set to
# early-stop against, so the count was presumably chosen from the earlier run.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=210,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data[good_columns2], labels, cat_features = cat_cols_selected,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test[good_columns2])
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe_final2_selected_fe.csv')
# Build interaction features on the train/validation split, fit CatBoost with
# an eval_set, then run importance-based feature selection on the fitted model.
data_plus_new_num_train = normed_fe_interaction_2_3_4_5(feats_train[good_columns],max_feats = 100,num_columns=num_columns)
data_plus_new_num_val = normed_fe_interaction_2_3_4_5(feats_val[good_columns],max_feats = 100,num_columns=num_columns)
# cat_columns_new is the extended categorical column list produced by the
# categorical-interaction helper; reused for cat_prep and cat_features.
data_plus_num_cat_train,cat_columns_new = categ_fe_interaction(data_plus_new_num_train, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_val = categ_fe_interaction(data_plus_new_num_val, max_feats = 100,cat_columns=cat_columns)[0]
data_train = cat_prep(data_plus_num_cat_train,cat_columns_new)
data_val = cat_prep(data_plus_num_cat_val,cat_columns_new)
data_train.head()
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns_new,verbose=10, plot=True,
eval_set = (data_val, labels_val))
# threshold=0 keeps every feature with non-zero importance.
num_cols_selected, cat_cols_selected ,importances = feature_selection_loop_prediction(estimator_cb,data_val,labels_val,cat_columns_new,threshold=0)
good_columns2 = cat_cols_selected + num_cols_selected
len(good_columns2)
# Retrain on the selected feature subset (good_columns2 from the previous
# selection step) and re-evaluate ROC AUC / PR AUC on the validation split.
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data_train[good_columns2], labels_train, cat_features = cat_cols_selected,verbose=10, plot=True,
eval_set = (data_val[good_columns2], labels_val))
# Churn probability is the second column of predict_proba.
prb = estimator_cb.predict_proba(data_val[good_columns2])
print(f'ROC AUC: {roc_auc_score(labels_val.values,prb[:,1])}')
print(f'PRC AUC: {average_precision_score(labels_val.values,prb[:,1])}')
pred = estimator_cb.predict(data_val[good_columns2])
print(classification_report(labels_val,pred))
print('------------------')
print('------------------')
# Precision/recall curve at a grid of thresholds (project helper).
pr_plot(labels_val.values,prb,[0.05,0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8,0.9,0.95,0.96,0.97,0.99])
# Final fit for this experiment: rebuild the engineered features on the full
# labelled set and the test set, train on all labels (400 iterations, no
# eval_set) and write the submission.
data_plus_new_num = normed_fe_interaction_2_3_4_5(features[good_columns], max_feats = 100,num_columns=num_columns)
data_plus_new_num_test = normed_fe_interaction_2_3_4_5(test_data[good_columns], max_feats = 100,num_columns=num_columns)
data_plus_num_cat,cat_columns_new = categ_fe_interaction(data_plus_new_num, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_test = categ_fe_interaction(data_plus_new_num_test, max_feats = 100,cat_columns=cat_columns)[0]
data = cat_prep(data_plus_num_cat,cat_columns_new)
data_test = cat_prep(data_plus_num_cat_test,cat_columns_new)
data.head()
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=400,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data[good_columns2], labels, cat_features = cat_cols_selected,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test[good_columns2])
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe_final2_selected_fe2.csv')
# Simple ensembling: average the predicted probabilities of several earlier
# submission files column-wise and write the result as submission_mean11.
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_cb_new_fe6',
'submission_mean6','submission_mean7','submission_mean8','submission_mean10']
submission_list = [pd.read_csv(f'{submission_name}.csv',index_col=0) for submission_name in sub_names]
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean11.csv')
# Second-level ensembling: average the previous submissions (including the
# freshly produced mean11 and selected_fe2) into submission_mean12.
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_cb_new_fe6',
'submission_mean6','submission_mean7','submission_mean8','submission_mean10','submission_mean11','submission_cb_new_fe_final2_selected_fe2']
submission_list = [pd.read_csv(f'{submission_name}.csv',index_col=0) for submission_name in sub_names]
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean12.csv')
# Indices of significant features after preprocessing.
# data_preprocessor presumably filters columns by NaN share, share of unique
# values and category-count limit — verify against its definition.
[num_inds, cat_indices1,cat_indices2]= data_preprocessor(feats_train,params=[0.9, 0.7, 30])# params = [prop_nan,max_prop_unique,N_lim]
cat_columns = list(np.array(features.columns)[cat_indices1]) + list(np.array(features.columns)[cat_indices2])
num_columns = list(np.array(features.columns)[num_inds])
good_columns = num_columns + cat_columns
# Pairwise (level=2) numeric interaction features for the train/val split.
data_plus_new_num_train = normed_fe_interaction(feats_train[good_columns],level =2 ,max_feats = 100,num_columns=num_columns)
data_plus_new_num_val = normed_fe_interaction(feats_val[good_columns],level =2, max_feats = 100,num_columns=num_columns)
data_train = cat_prep(data_plus_new_num_train,cat_columns)
data_val = cat_prep(data_plus_new_num_val,cat_columns)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (data_val, labels_val))
# threshold=0 keeps every feature with non-zero importance.
num_cols_selected, cat_cols_selected ,importances = feature_selection_loop_prediction(estimator_cb,data_val,labels_val,cat_columns,threshold=0)
good_columns2 = num_cols_selected + cat_cols_selected
# Rebuild the same level-2 features on the full labelled set and the test set,
# retrain on all labels and write the submission.
data_plus_new_num = normed_fe_interaction(features[good_columns],level =2 ,max_feats = 100,num_columns=num_columns)
data_plus_new_num_test = normed_fe_interaction(test_data[good_columns],level =2, max_feats = 100,num_columns=num_columns)
data = cat_prep(data_plus_new_num ,cat_columns)
data_test = cat_prep(data_plus_new_num_test,cat_columns)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data[good_columns2], labels, cat_features = cat_cols_selected,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test[good_columns2])
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe_final3_selected_fe3.csv')
# Same experiment as above but with categorical interaction features added on
# top of the level-2 numeric interactions: fit on train/val, select features
# by importance, then retrain on the full labelled set and submit.
data_plus_new_num_train = normed_fe_interaction(feats_train[good_columns],level =2 ,max_feats = 100,num_columns=num_columns)
data_plus_new_num_val = normed_fe_interaction(feats_val[good_columns],level =2, max_feats = 100,num_columns=num_columns)
# cat_columns_new: categorical column list extended with interaction columns.
data_plus_num_cat_train,cat_columns_new = categ_fe_interaction(data_plus_new_num_train, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_val = categ_fe_interaction(data_plus_new_num_val, max_feats = 100,cat_columns=cat_columns)[0]
data_train = cat_prep(data_plus_num_cat_train,cat_columns_new)
data_val = cat_prep(data_plus_num_cat_val,cat_columns_new)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns_new,verbose=10, plot=True,
eval_set = (data_val, labels_val))
# Keep every feature with non-zero importance (threshold=0).
num_cols_selected, cat_cols_selected ,importances = feature_selection_loop_prediction(estimator_cb,data_val,labels_val,cat_columns_new,threshold=0)
good_columns2 = num_cols_selected + cat_cols_selected
# Rebuild features on the full labelled set and the test set for the final fit.
data_plus_new_num = normed_fe_interaction(features[good_columns],level =2 ,max_feats = 100,num_columns=num_columns)
data_plus_new_num_test = normed_fe_interaction(test_data[good_columns],level =2, max_feats = 100,num_columns=num_columns)
data_plus_num_cat,cat_columns_new = categ_fe_interaction(data_plus_new_num, max_feats = 100,cat_columns=cat_columns)
data_plus_num_cat_test = categ_fe_interaction(data_plus_new_num_test, max_feats = 100,cat_columns=cat_columns)[0]
data = cat_prep(data_plus_num_cat,cat_columns_new)
data_test = cat_prep(data_plus_num_cat_test,cat_columns_new)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data[good_columns2], labels, cat_features = cat_cols_selected,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test[good_columns2])
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe_final3_selected_fe4.csv')
# Indices of significant features after preprocessing.
# Relaxed filter parameters (0.99/0.99) keep almost all columns; max_feats is
# also raised to 150 for the interaction features in this experiment.
[num_inds, cat_indices1,cat_indices2]= data_preprocessor(feats_train,params=[0.99, 0.99, 30])# params = [prop_nan,max_prop_unique,N_lim]
cat_columns = list(np.array(features.columns)[cat_indices1]) + list(np.array(features.columns)[cat_indices2])
num_columns = list(np.array(features.columns)[num_inds])
good_columns = num_columns + cat_columns
# Level-2 numeric interaction features for the train/val split.
data_plus_new_num_train = normed_fe_interaction(feats_train[good_columns],level =2 ,max_feats = 150,num_columns=num_columns)
data_plus_new_num_val = normed_fe_interaction(feats_val[good_columns],level =2, max_feats = 150,num_columns=num_columns)
data_train = cat_prep(data_plus_new_num_train,cat_columns)
data_val = cat_prep(data_plus_new_num_val,cat_columns)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data_train, labels_train, cat_features = cat_columns,verbose=10, plot=True,
eval_set = (data_val, labels_val))
# Keep every feature with non-zero importance (threshold=0).
num_cols_selected, cat_cols_selected ,importances = feature_selection_loop_prediction(estimator_cb,data_val,labels_val,cat_columns,threshold=0)
good_columns2 = num_cols_selected + cat_cols_selected
# Rebuild features on the full labelled set and the test set, retrain and submit.
data_plus_new_num = normed_fe_interaction(features[good_columns],level =2 ,max_feats = 150,num_columns=num_columns)
data_plus_new_num_test = normed_fe_interaction(test_data[good_columns],level =2, max_feats = 150,num_columns=num_columns)
data = cat_prep(data_plus_new_num ,cat_columns)
data_test = cat_prep(data_plus_new_num_test,cat_columns)
estimator_cb = CatBoostClassifier(task_type='GPU',iterations=380,random_state=0,
eval_metric = 'AUC',
learning_rate=0.08,
boosting_type = 'Ordered',
bootstrap_type='Bernoulli',
subsample=0.8,
one_hot_max_size=10,
leaf_estimation_iterations=10,
max_ctr_complexity=4
)
estimator_cb.fit(data[good_columns2], labels, cat_features = cat_cols_selected,verbose=10, plot=True)
probs = estimator_cb.predict_proba(data_test[good_columns2])
write_to_submission_file(probs[:,1],out_file='submission_cb_new_fe_final3_selected_fe5.csv')
# Average another round of submissions (including mean12 and the selected-fe
# runs) into submission_mean13.
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_cb_new_fe6',
'submission_mean6','submission_mean7','submission_mean8','submission_mean10','submission_mean11',
'submission_cb_new_fe_final2_selected_fe2','submission_cb_new_fe_final3_selected_fe3','submission_mean12']
submission_list = [pd.read_csv(f'{submission_name}.csv',index_col=0) for submission_name in sub_names]
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean13.csv')
# Final ensemble: average (almost) all accumulated submissions, including the
# intermediate means, into submission_mean14.
sub_names = ['submission_cb_new_fe1','submission_mean3','submission_mean4','submission_mean5','submission_cb_new_fe6',
'submission_mean6','submission_mean7','submission_mean8','submission_mean10','submission_mean11',
'submission_cb_new_fe_final2_selected_fe2','submission_cb_new_fe_final3_selected_fe3','submission_mean12',
'submission_cb_new_fe_final3_selected_fe5','submission_cb_new_fe9','submission_cb_new_fe5',
'submission_cb_new_fe4','submission_cb_new_fe3','submission_mean2','submission_cb_new_fe2',
'submission_mean1','submission_cb6','submission_cb5','submission_cb3','submission_mean13']
submission_list = [pd.read_csv(f'{submission_name}.csv',index_col=0) for submission_name in sub_names]
sub_df = pd.concat(submission_list,axis=1)
sub_df['mean'] = sub_df.mean(axis=1)
write_to_submission_file(sub_df[['mean']].values,out_file='submission_mean14.csv')
Итак, после множества экспериментов с моделью, признаками и усреднением, победил Catboost c использованием множества дополнительных признаков взаимодействия, также с учетом отбора, и в конце с усреднением нескольких ответов. Место на данный момент 4, метрика 0.72909.
Теперь можно придумать простую схему работы с потенциально уходящими клиентами. Сделаем предварительную оценку эффекта.
# Prototype economic model: estimate the monthly gain from acting on the
# churn model's alerts with a retention discount.
# Classifier operating point
precision = 0.3
recall = 0.3
# Company parameters
n_clients = 1e6
av_revenue_per_client = 500
av_churn_prop = 0.05
# Retention-offer parameters
discount = 0.05
return_rate = 0.4
# Expected number of model alerts: true positives (churners * recall)
# divided by precision.
n_activations = n_clients * av_churn_prop * recall / precision
# Baseline revenue from the clients that stay anyway.
revenue_from_loyal_clients = n_clients * (1 - av_churn_prop) * av_revenue_per_client
# Discounted revenue recovered from caught churners who accept the offer.
revenue_from_returned_clients = n_clients * av_churn_prop * av_revenue_per_client * (1 - discount) * recall * return_rate
# Cost of discounts handed to false positives (loyal clients flagged as churners).
discount_inacc_loss = n_activations * (1 - precision) * av_revenue_per_client * discount
# Revenue with the model in place
total_revenue = revenue_from_loyal_clients + revenue_from_returned_clients - discount_inacc_loss
# Net economic effect of using the model, absolute and relative to baseline.
model_gain = total_revenue - revenue_from_loyal_clients
model_gain_percent = model_gain / revenue_from_loyal_clients * 100
print(f'Выгода от использования модели: {model_gain/1e6} млн руб./месяц')
print(f'Относительная выгода: {round(model_gain_percent,3)} %')
Видно, что при использовании модели имеется экономический эффект. Зададим функцию для расчета эффекта и посмотрим, как он будет меняться при изменении параметров модели и предложения. Будем считать, что доля уходящих клиентов известна из средней доли за предыдущие периоды.
def usage_efficiency_est(precision,recall,n_clients,av_revenue_per_client,av_churn_prop,discount,return_rate):
    """Estimate the economic effect of deploying the churn model.

    Parameters
    ----------
    precision, recall : float
        Operating-point metrics of the classifier; ``precision`` must be > 0.
    n_clients : float
        Total size of the client base.
    av_revenue_per_client : float
        Average revenue per client per period.
    av_churn_prop : float
        Expected share of churning clients (e.g. from historical data).
    discount : float
        Discount fraction offered to clients flagged as likely churners.
    return_rate : float
        Share of correctly flagged churners who accept the offer and stay.

    Returns
    -------
    tuple of float
        ``(model_gain, model_gain_percent)`` — absolute gain and gain as a
        percentage of baseline revenue from loyal clients.

    Raises
    ------
    ValueError
        If ``precision`` is not positive (the alert count would be undefined;
        previously this surfaced as an opaque ZeroDivisionError).
    """
    if precision <= 0:
        raise ValueError('precision must be positive')
    # Expected number of model alerts: true positives (churners*recall) / precision.
    n_activations = n_clients*av_churn_prop*recall/precision
    # Baseline revenue from clients who stay regardless of the model.
    revenue_from_loyal_clients = n_clients*(1-av_churn_prop)*av_revenue_per_client
    # Discounted revenue recovered from caught churners who accept the offer.
    revenue_from_returned_clients = n_clients*av_churn_prop*av_revenue_per_client*(1-discount)*recall*return_rate
    # Cost of discounts granted to false positives (loyal clients flagged).
    discount_inacc_loss = n_activations*(1-precision)*av_revenue_per_client*discount
    # Revenue with the model in place
    total_revenue = revenue_from_loyal_clients + revenue_from_returned_clients - discount_inacc_loss
    # Net economic effect of using the model
    model_gain = total_revenue - revenue_from_loyal_clients
    model_gain_percent = model_gain/revenue_from_loyal_clients * 100
    return model_gain,model_gain_percent
Оценим долю возвращающихся клиентов при использовании скидки, как min(1, 5*discount). Чтобы обосновать данную зависимость, можно провести опросы среди клиентов. При дальнейшем использовании данная оценка может быть уточнена.
# Sweep the discount size and plot the relative economic effect; the return
# rate is modelled as min(1, 5*discount).
discounts = np.linspace(0.05,0.3,50)
effects = [usage_efficiency_est(0.3,0.3,1e6,500,0.06,d,min(1,5*d))[1] for d in discounts]
plt.plot(discounts,effects)
plt.xlabel('discount proportion')
plt.ylabel('percent of effect');
# Plot the relative effect at several (precision, recall) operating points,
# each corresponding to a classification threshold in trsh.
trsh =[0.05,0.1,0.15,0.35,0.5]
precisions = [0.1,0.2,0.3,0.4,0.6]
recalls = [0.6,0.5,0.3,0.1,0.01]
model_params = list(zip(precisions,recalls))
effects = [usage_efficiency_est(p,r,1e6,500,0.06,0.15,0.75)[1] for p,r in model_params]
plt.plot(trsh,effects)
plt.xlabel('threshold')
plt.ylabel('percent of effect');
Итак, для лучшей модели и доли потенциально уходящих клиентов 6%, в первом приближении оптимальными получились порог отнесения к оттоку в 0.15 (пороговая вероятность оттока в 15%) и предложение скидки в 15-20%. При таких параметрах относительная выгода при использовании модели больше 0.5%, что при 500 млн руб. дохода составит более 2,5 млн руб. в месяц. В качестве улучшения, можно давать скидку пропорционально вероятности оттока: тогда, если средняя вероятность оттока для клиентов окажется меньше той, которая соответствует скидке в 15-20%, получится экономия при тех же значениях доли возвращающихся клиентов.